Let's look at a slightly more interesting visual example than character or digit classification. Classifying cars. We'll use the Cars dataset from Jonathan Krause et al. at Stanford -- 16,185 images in 196 classes:
http://ai.stanford.edu/~jkrause/cars/car_dataset.html
The goal is to show all the steps of exploration and tuning on a new dataset, including false starts, etc. I deliberately didn't read lots of papers on this specific dataset or application, to better simulate working on an unfamiliar problem. I also didn't refactor the code to be prettier or remove things that aren't really needed. If you want to see how to use a particular technique, there are more concise guides out there.
The plan going in:
Note: if you try to run this notebook, make sure you have lots of RAM (I have 16GB) -- it isn't very careful about memory use.
%load_ext autoreload
%autoreload 2
# system
import os
import glob
import itertools as it
import operator
from collections import defaultdict
from StringIO import StringIO
# other libraries
import numpy as np
import pandas as pd
import scipy.io # for loading .mat files
import scipy.misc # for imresize
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import seaborn as sns
import requests
# my code -- various helpful utilities
from display import visualize_keras_model, plot_training_curves
from helpers import combine_histories
%matplotlib inline
sns.set_style("white")
p = sns.color_palette()
# repeatability:
np.random.seed(42)
data_root = os.path.expanduser("~/data/cars")
I've already downloaded and unzipped the dataset. Let's see how it's organized.
os.listdir(data_root)
Let's look at the annotations first. It's a matlab file; luckily scipy has a function to load it.
cars_annos = scipy.io.loadmat(os.path.join(data_root, 'cars_annos.mat'))
cars_annos.keys()
Class names seems promising. Let's take a look.
cars_annos['class_names']
Funny structure. Let's turn it into a flat list.
classes = [x[0] for x in cars_annos['class_names'][0]]
classes[:10]
Looks good. Now let's split into (brand, model, type, year) tuples. This will require a bit of manual munging...
first, years = zip(*[(s[:-5], s[-4:])
for s in classes])
# Check years...
sorted(pd.unique(years))
first[:5]
# Ok, let's pull out the car types
models, car_types = zip(*[(s.split()[:-1], s.split()[-1])
for s in first])
pd.unique(car_types)
Hmm. Car types are a bit messy -- many are multi-word. We'll want to clean this up later -- I don't know cars well enough to classify without seeing example images, so let's go to that.
images_dir = os.path.join(data_root, "car_ims")
len(os.listdir(images_dir))
Ok, so we have our 16K images. Let's look at a few random ones.
# BUG FIX: image_paths was referenced before it was defined (the lookup
# cell sat above the listdir call). Define it first.
image_paths = os.listdir(images_dir)
image_paths[12]
# Show a handful of images spread across the dataset.
for i in [12, 35, 3600, 12345]:
    img = Image.open(os.path.join(images_dir, image_paths[i]))
    size = img.size  # save, since thumbnail() mutates it
    img.thumbnail((128, 128))  # mostly to make the notebook file smaller
    fig, ax = plt.subplots(figsize=(3, 2))
    ax.imshow(img)
    ax.set_title('shape: ' + str(size))
    ax.grid(False)
Ok, we can read the images. They appear to be of vastly different sizes. Let's take a closer look.
shapes = [Image.open(os.path.join(images_dir, path)).size
for path in image_paths]
No errors! It's nice to have clean data...
widths = [s[0] for s in shapes]
aspect_ratios = [s[0]/float(s[1]) for s in shapes]
fig, ax = plt.subplots(figsize=(4,2.5))
ax.set_title("Image widths")
ax.hist(widths, bins=30)
sns.despine(fig)
fig, ax = plt.subplots(figsize=(4,2.5))
ax.set_title("Aspect ratios")
ax.hist(aspect_ratios, bins=30);
sns.despine(fig)
Essentially all images are in landscape orientation, and aren't too big--less than 1500px wide. A significant number are pretty small -- just a few hundred pixels. That's probably ok for us -- we'll want to scale down for performance reasons anyway.
Ok, now let's get the classes for each image -- we need to look at the annotations part of the dict...
len(cars_annos['annotations'])
len(cars_annos['annotations'][0])
cars_annos['annotations'][0][0]
# what are the fields?
cars_annos['annotations'].dtype
# get rid of the nested arrays
from collections import namedtuple
Example = namedtuple('Example',
['rel_path', 'x1', 'y1', 'x2','y2','cls','test'])
# silly nested nested lists...
examples = [Example(*[a.flatten()[0] for a in x])
for x in cars_annos['annotations'][0]]
examples[0]
Ok, that worked. Now let's look at a couple of images from each class.
key_fn = operator.attrgetter('cls')
by_class = {} # key -> lst
for cls, group in it.groupby(sorted(examples, key=key_fn), key_fn):
by_class[cls] = list(group)
sorted(by_class.keys())[:5] # note: classes start at 1, not 0
# 196 = 14*14
fig, plots = plt.subplots(14,14, sharex='all', sharey='all',
figsize=(28,28))
for i in range(196):
# read the image
rel_path = by_class[i+1][0].rel_path
# Note: rel_paths include 'car_ims/'
img = Image.open(os.path.join(data_root, rel_path))
img = img.resize((100,100))
plots[i // 14, i % 14].axis('off')
plots[i // 14, i % 14].imshow(img)
Pretty! :)
# Which classes have the most examples?
counts = sorted([(k, classes[k-1], len(by_class[k]))
for k in by_class.keys()],
reverse=True,
key=operator.itemgetter(2))
print("5 most frequent:\n")
print("\n".join(map(str, counts[:5])))
print("\n5 least frequent:\n")
print("\n".join(map(str, counts[-5:])))
fig, ax = plt.subplots(figsize=(4,2.5))
ax.plot(range(len(counts)), [x[2] for x in counts])
ax.set_title("Sorted counts by class")
sns.despine(fig)
Ok, now that we have a sense of what the dataset looks like, let's do some preprocessing.
# 12 bytes per pixel. Size in MB.
16100 * 227 * 227 * 12 / 1024 / 1024
9.5G is a bit much. Let's keep them in jpg for now, and decode on the fly. (Could try both and see what's faster...)
resized_path = os.path.join(data_root,'resized_car_ims')
if not os.path.exists(resized_path):
os.mkdir(resized_path)
# Resize all the things. Will take a little while.
for fname in image_paths:
new_path = os.path.join(resized_path, fname)
# skip if already exists.
# (Blow away whole directory if there's a problem, or to change
# resize strategy)
if not os.path.exists(new_path):
img = Image.open(os.path.join(images_dir, fname))
img = img.resize((227,227))
img.save(new_path)
# double check
len(os.listdir(resized_path))
!du -shc {resized_path} {images_dir}
Get a smaller dataset to process. Will make for faster processing.
Ok, now we have standard size images and roughly understand what things look like. Let's start with a toy problem to get our code working.
Problem 1: distinguishing Hummers from the Acura sedan.
These happen to be the first two classes, and it seems to be a relatively simple task -- certainly for humans.
# here are our classes
classes[:2]
# and here are a few images for each
fig, plots = plt.subplots(2,6, sharex='all', sharey='all',
figsize=(24,6))
for i in range(2):
for j in range(6):
# read the image
rel_path = by_class[i+1][j].rel_path
# Note: rel_paths include 'car_ims/'
img = Image.open(os.path.join(data_root, rel_path))
img = img.resize((100,100))
plots[i, j].axis('off')
plots[i, j].imshow(img)
# How many examples do we have?
fig, ax = plt.subplots()
ax.bar([1,2], [len(by_class[i]) for i in [1,2]], tick_label=["Hummer", "Acura"], align="center")
sns.despine(fig)
Let's get our training, validation, and test data prepared. We'll just keep it in-memory, at least for now.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
# https://github.com/fchollet/keras/issues/4499
from keras.layers.core import K
from keras.callbacks import TensorBoard
# for name scopes to make TensorBoard look prettier
# (doesn't work well yet as of Keras 1.x -- maybe better in 2.x)
import tensorflow as tf
def gray_to_rgb(im):
    """
    Replicate a single-channel (grayscale) image across three RGB channels.

    Needed because the dataset contains at least one grayscale image
    (discovered via an array projection error in the code below).
    """
    dim0, dim1 = im.shape
    rgb = np.empty((dim0, dim1, 3), dtype=np.uint8)
    for channel in range(3):
        rgb[:, :, channel] = im
    return rgb
def load_examples(by_class, cls, limit=None):
    """
    Load the (pre-resized) images for one class.

    Ignores the dataset's own test/train flag -- we do our own
    train/validation/test split later.

    Args:
        by_class: dict mapping class_id -> [Example()]
        cls: class id to load
        limit: if given, load at most this many images.

    Returns:
        list of (X, y) tuples, one per image, where
        X: 227x227x3 uint8 ndarray
        y: the class id (always equal to cls)
    """
    examples = by_class[cls][:limit] if limit else by_class[cls]
    loaded = []
    for example in examples:
        # read from the pre-resized copy, not the original image
        path = os.path.join(
            data_root,
            example.rel_path.replace('car_ims', 'resized_car_ims'))
        img = mpimg.imread(path)
        if len(img.shape) == 2:
            # grayscale image -- expand to three channels
            img = gray_to_rgb(img)
        loaded.append((img, cls))
    return loaded
Train network on training data, tune parameters using validation set, and finally see how well we did on the test set.
def split_examples(xs, valid_frac, test_frac):
    """
    Randomly partition xs into (train, valid, test) lists.

    valid_frac and test_frac give the fraction of examples for the
    validation and test sets (rounded down); everything else goes to
    train. The caller's list is left unmodified.

    Returns:
        (train, valid, test)
    """
    assert valid_frac + test_frac < 1
    n = len(xs)
    n_valid = int(valid_frac * n)
    n_test = int(test_frac * n)
    n_train = n - n_valid - n_test
    # shuffle a copy, not the caller's list
    pool = xs[:]
    np.random.shuffle(pool)
    train = pool[:n_train]
    valid = pool[n_train:n_train + n_valid]
    test = pool[n_train + n_valid:]
    return (train, valid, test)
# quick test
split_examples(range(10), 0.2, 0.4)
valid_frac = 0.2
test_frac = 0.2
# load the Hummer and acura images
(train, valid, test) = split_examples(load_examples(by_class, 1),
valid_frac, test_frac)
(train2, valid2, test2) = split_examples(load_examples(by_class, 2),
valid_frac, test_frac)
train.extend(train2)
valid.extend(valid2)
test.extend(test2)
# ...and shuffle to make training work better.
np.random.shuffle(train)
np.random.shuffle(valid)
np.random.shuffle(test)
# We have lists of (X,Y) tuples. Let's unzip into lists of Xs and Ys.
X_train, Y_train = zip(*train)
X_valid, Y_valid = zip(*valid)
X_test, Y_test = zip(*test)
# and turn into np arrays of the right dimension.
def convert_X(xs):
    """Stack a list of (w, h, 3) images into a single float32 ndarray."""
    return np.asarray(xs, dtype='float32')
X_train = convert_X(X_train)
X_valid = convert_X(X_valid)
X_test = convert_X(X_test)
# Note that despite lots of Keras examples online that want the data to
# have shape 3,w,h, we actually want w,h,3 when using the TensorFlow
# backend.
X_train.shape
A note on the class labels before we one-hot encode them:
We need to make sure they're sequential starting from 0.
all_ys = sorted(set(Y_train).union(set(Y_valid)).union(set(Y_test)))
n_classes = len(all_ys)
mapping = dict(zip(all_ys, range(n_classes)))
mapping
def convert_Y(ys, mapping):
    """
    Remap class labels to contiguous ids starting at 0 (via `mapping`)
    and one-hot encode the result as an np array.
    """
    remapped = np.array([mapping[y] for y in ys])
    return np_utils.to_categorical(remapped, len(mapping))
Y_train = convert_Y(Y_train, mapping)
Y_valid = convert_Y(Y_valid, mapping)
Y_test = convert_Y(Y_test, mapping)
Y_train.shape
Let's build a simple logistic classifier. We'll convert to grayscale to reduce number of params by factor of 3.
X_train[0].mean(axis=2).shape
def normalize_to_gray(xs):
    """Scale pixel values into [0, 1], average the RGB channels down to
    grayscale, and flatten each image into a 227*227 vector.

    (Plain channel averaging isn't the perceptually correct grayscale
    conversion, but it's good enough here.)
    """
    gray = np.mean(xs / 255.0, axis=3)
    return gray.reshape((-1, 227 * 227))
X_train_gray = normalize_to_gray(X_train)
X_valid_gray = normalize_to_gray(X_valid)
X_test_gray = normalize_to_gray(X_test)
X_train_gray[0].shape
# (1-x) because grayscale images are backwards--1 is black, it appears
plt.imshow((1 - X_train_gray[0]).reshape((227,227)))
def logistic_model():
    """Softmax regression on the flattened grayscale pixels.

    227*227 inputs, 2 outputs. A resolution this high is overkill for a
    model this simple, but it keeps the pipeline uniform.
    """
    net = Sequential()
    net.add(Dense(output_dim=2, input_dim=227 * 227))
    net.add(Activation('softmax'))
    return net
model = logistic_model()
model.compile(loss='categorical_crossentropy',
optimizer='adadelta',
metrics=['accuracy'])
Y_train.shape
history = model.fit(X_train_gray, Y_train,
batch_size=16, nb_epoch=100, verbose=1,
validation_data=(X_valid_gray, Y_valid))
plot_training_curves(history.history);
score = model.evaluate(X_test_gray, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
predict_train = model.predict(X_train_gray)
predict_valid = model.predict(X_valid_gray)
predict_test = model.predict(X_test_gray)
Surprise! It's hugely overfitting, with 50K parameters and a couple of hundred images. Not very useful.
We'll see if a simple conv net will do a bit better. Then we can try data augmentation, regularization, and perhaps making our problem simpler by using the bounding boxes. First though, let's put together a bit more infrastructure:
# Look at training data -- there's so little we can look at all of it
def plot_data(xs, ys, predicts):
    """Plot the images in xs with their correct labels and predictions.

    Args:
        xs: RGB or grayscale images with float32 values in [0,1].
        ys: one-hot encoded labels
        predicts: probability vectors (same dim as ys, normalized e.g. via softmax)

    Returns:
        the matplotlib figure
    """
    # sort all 3 together by label so the classes appear grouped
    xs, ys, ps = zip(*sorted(zip(xs, ys, predicts),
                             key=lambda tpl: tpl[1][0]))
    n = len(xs)
    # ceil(n / 10) rows of 10.
    # BUG FIX: was `(n+9)/10`, which only works under Python 2's integer
    # division; `//` is identical there and stays an int under Python 3.
    rows = (n + 9) // 10
    fig, plots = plt.subplots(rows, 10, sharex='all', sharey='all',
                              figsize=(20, 2 * rows), squeeze=False)
    for i in range(n):
        ax = plots[i // 10, i % 10]
        ax.axis('off')
        img = xs[i].reshape(227, 227, -1)
        if img.shape[-1] == 1:  # Grayscale
            # Get rid of the unneeded channel dimension
            img = img.squeeze()
            # flip grayscale (0 is white in our normalized arrays)
            img = 1 - img
        ax.imshow(img)
        # dot with the one-hot vector picks out p(correct class)
        pcorrect = np.dot(ps[i], ys[i])
        color = "blue" if pcorrect > 0.8 else "red"
        ax.set_title("{} p={:.2f}".format(int(ys[i][0]), pcorrect),
                     loc='center', fontsize=18, color=color)
    return fig
fig = plot_data(X_train_gray, Y_train, predict_train)
fig.suptitle("Train")
fig = plot_data(X_valid_gray, Y_valid, predict_valid)
fig.suptitle("Valid")
fig = plot_data(X_test_gray, Y_test, predict_test)
fig.suptitle("Test")
Looking at this really highlights how little data we have. Probably not worth using 40% for validation and test. Cross-validation would be better, and with this little data, would probably be fast enough. Can definitely benefit from data augmentation too.
On the other hand, looks like the images are pretty well centered -- bounding boxes might help a bit, but there are very few gross scale differences.
While we're building infrastructure, let's change above plots to include whether we got each example right and wrong. [done]
We'll use three conv layers, then fully connected one. We'll use dropout to try to fight overfitting...
# normalize the data, this time leaving it in color
def normalize_for_cnn(xs):
    """Scale raw pixel values into [0, 1], keeping all three color channels."""
    return xs / 255.0
X_train_norm = normalize_for_cnn(X_train)
X_valid_norm = normalize_for_cnn(X_valid)
X_test_norm = normalize_for_cnn(X_test)
X_train_norm.shape
def cnn_model(use_dropout=True):
    """Build a small CNN: three conv/pool stages, one dense layer, softmax.

    Dropout after every stage (when enabled) is what makes this trainable
    on so few images.
    """
    model = Sequential()
    nb_filters = 16
    pool_size = (2, 2)
    filter_size = 3
    nb_classes = 2
    # Three identical conv -> relu -> pool (-> dropout) stages;
    # only the first layer needs input_shape.
    for stage in range(1, 4):
        with tf.name_scope("conv%d" % stage):
            if stage == 1:
                model.add(Convolution2D(nb_filters, filter_size,
                                        input_shape=(227, 227, 3)))
            else:
                model.add(Convolution2D(nb_filters, filter_size))
            model.add(Activation('relu'))
            model.add(MaxPooling2D(pool_size=pool_size))
            if use_dropout:
                model.add(Dropout(0.5))
    with tf.name_scope("dense1"):
        model.add(Flatten())
        model.add(Dense(16))
        model.add(Activation('relu'))
        if use_dropout:
            model.add(Dropout(0.5))
    with tf.name_scope("softmax"):
        model.add(Dense(nb_classes))
        model.add(Activation('softmax'))
    return model
# Uncomment if getting a "Invalid argument: You must feed a value
# for placeholder tensor ..." when rerunning training.
# K.clear_session() # https://github.com/fchollet/keras/issues/4499
model2 = cnn_model()
model2.compile(loss='categorical_crossentropy',
optimizer='adadelta',
metrics=['accuracy'])
print(model2.summary())
recompute = False
if recompute:
# Save info during computation so we can see what's happening
tbCallback = TensorBoard(
log_dir='./graph', histogram_freq=1,
write_graph=False, write_images=False)
# Fit the model!
history = model2.fit(
X_train_norm, Y_train,
batch_size=16, nb_epoch=100, verbose=1,
validation_data=(X_valid_norm, Y_valid),
callbacks=[tbCallback]
)
else:
model2.load_weights('hummer_acura_simple_cnn.h5')
model2.save('hummer_acura_simple_cnn.h5')
plot_training_curves(history.history);
score = model2.evaluate(X_test_norm, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
Hmm. Seems to work remarkably well, given how little data we have! Yay for dropout. We can move on to a more interesting problem, but first, let's look at a few things:
First, let's take a look at the results:
predict_train = model2.predict(X_train_norm)
predict_valid = model2.predict(X_valid_norm)
predict_test = model2.predict(X_test_norm)
fig = plot_data(X_train_norm, Y_train, predict_train)
fig.suptitle("Train")
fig = plot_data(X_valid_norm, Y_valid, predict_valid)
fig.suptitle("Valid")
fig = plot_data(X_test_norm, Y_test, predict_test)
fig.suptitle("Test")
Ok, now let's see how the same model does without dropout. Given how little data we have, I'd expect it to do a lot worse...
# Train the same CNN without dropout to see how badly it overfits.
model2_nodropout = cnn_model(use_dropout=False)
model2_nodropout.compile(loss='categorical_crossentropy',
                         optimizer='adadelta',
                         metrics=['accuracy'])
history = model2_nodropout.fit(X_train_norm, Y_train,
                               batch_size=16, nb_epoch=100, verbose=1,
                               validation_data=(X_valid_norm, Y_valid))
plot_training_curves(history.history);
# BUG FIX: was `model2_nodropoutropout.evaluate(...)` -- a typo that
# raised NameError before the test score could be printed.
score = model2_nodropout.evaluate(X_test_norm, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
Indeed, without dropout the network quickly gets a perfect training score and does not generalize at all. It's really amazing that dropout lets us train ~180K parameters with only 95 training images in a generalizable way! (See this Geoff Hinton talk for some discussion of why this works...)
Let's look at a few images from the internet. I found these manually. To be scientific, we'd want to get more data. For now, just getting an initial sense.
def image_from_url(url):
    """Fetch an image over HTTP and return it as a PIL Image."""
    resp = requests.get(url)
    return Image.open(StringIO(resp.content))
hummer_urls = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/5/58/2002-09-11-Marbella-22.jpg/220px-2002-09-11-Marbella-22.jpg",
"https://s-media-cache-ak0.pinimg.com/originals/83/01/a0/8301a0d1d45e4a53e1a790fe79f6cbef.jpg",
"http://vignette1.wikia.nocookie.net/asphalt/images/2/26/Hummer_h1.jpg/revision/latest?cb=20150120143504",
]
acura_urls = [
"https://media.ed.edmunds-media.com/acura/rl/2012/oem/2012_acura_rl_sedan_base_fq_oem_6_500.jpg",
"https://media.ed.edmunds-media.com/acura/rl/2012/oem/2012_acura_rl_sedan_base_fq_oem_3_500.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/2005_Acura_RL_--_NHTSA.jpg/1200px-2005_Acura_RL_--_NHTSA.jpg"
]
hummers = map(image_from_url, hummer_urls)
acuras = map(image_from_url, acura_urls)
to_test = [img.resize((227,227))
for img in hummers + acuras]
to_test = [np.array(img).astype('float32')/255.0
for img in to_test]
to_test = np.array(to_test)
to_test.shape
predicts = model2.predict(to_test)
ys = np.array([[1,0],[1,0],[1,0],
[0,1],[0,1],[0,1]])
fig = plot_data(to_test, ys, predicts)
fig.suptitle("Internet test")
It seems to work ok. Just for fun, let's make sure we don't already have these images -- the original dataset was probably collected by googling too.
def cmp_images(a, b):
    """
    Approximately compare two images.

    Both must be numpy arrays of the same size, float32 normalized
    to [0, 1].

    Returns:
        True if the mean absolute pixel difference is below 0.1
        (a hack, but good enough for spotting near-duplicates).
    """
    # Idiom fix: return the comparison directly instead of
    # `if cond: return True` / `return False`. bool() keeps the
    # return type a plain Python bool, as before.
    return bool(np.abs(a - b).mean() < 0.1)
# mini test
cmp_images(X_train_norm[0], X_train_norm[0]), cmp_images(
X_train_norm[0], X_train_norm[1])
stop = False
for a in it.chain(X_train_norm, X_valid_norm, X_test_norm):
for b in to_test:
if cmp_images(a,b):
fig, plots = plt.subplots(1, 2)
plots[0].imshow(a)
plots[1].imshow(b)
Note that our classifier divides the world into hummer and acura, and has no conception of anything else. Let's see what it does with puppies...
puppy_urls = [
"http://cdn.earthporm.com/wp-content/uploads/2015/10/XX-Proud-Mommies5__605.jpg",
"https://ipetcompanion.com/feedapuppy/styles/media/puppy.jpg",
"https://i.ytimg.com/vi/PnY7WqoN4F8/hqdefault.jpg"
]
puppies = map(image_from_url, puppy_urls)
to_test = [img.resize((227,227))
for img in puppies]
to_test = [np.array(img).astype('float32')/255.0
for img in to_test]
to_test = np.array(to_test)
to_test.shape
predicts = model2.predict(to_test)
# We'll pretend that puppies are hummers, just to make the code happy
ys = np.array([[1,0],[1,0],[1,0]])
fig = plot_data(to_test, ys, predicts)
fig.suptitle("Puppies test")
And voila -- we have two Hummer puppies and an acura puppy. The lesson: either ensure that you're only feeding your classifier data from one of the expected classes, or prepare to handle inputs that don't belong. We can include a none-of-the-above class, or predict the prob of each class independently, without using a softmax, so they can all be low at once. Either way, we'd need to feed our network enough negative data.
As a final step before moving on to a more complex problem, let's add TensorBoard so we can look at what's happening in our network.
Ok, now we have basic pipeline working... let's try a more complex problem:
As a first step, let's finish up our car model classification into sedan, SUV, etc.
classes
Looks good. Now let's split into (brand, model, type, year) tuples. This will require a bit of manual munging...
# pull out brands -- will need a bit of massaging for two-word names
# like aston martin and land rover
pd.unique([c.split()[0] for c in classes])
# pull out car types
pd.unique([c.split()[-2] for c in classes])
def parse_classes(classes):
    """
    Parse the class name strings into (id, brand, model, type, year) tuples.

    Type will be one of:
    * Sedan
    * Convertible
    * Coupe
    * SUV
    * Pickup
    * Van
    * Wagon
    We may combine these further later...
    """
    brands = [u'AM', u'Acura', u'Aston Martin', u'Audi', u'BMW', u'Bentley', u'Bugatti',
              u'Buick', u'Cadillac', u'Chevrolet', u'Chrysler', u'Daewoo',
              u'Dodge', u'Eagle', u'FIAT', u'Ferrari', u'Fisker', u'Ford', u'GMC',
              u'Geo', u'HUMMER', u'Honda', u'Hyundai', u'Infiniti', u'Isuzu',
              u'Jaguar', u'Jeep', u'Lamborghini', u'Land Rover', u'Lincoln', u'MINI',
              u'Maybach', u'Mazda', u'McLaren', u'Mercedes-Benz', u'Mitsubishi',
              u'Nissan', u'Plymouth', u'Porsche', u'Ram', u'Rolls-Royce',
              u'Scion', u'Spyker', u'Suzuki', u'Tesla', u'Toyota', u'Volkswagen',
              u'Volvo', u'smart']
    car_types = ['SUV', 'Sedan', 'Convertible', 'Coupe',
                 'Wagon', 'Minivan', 'Van']
    # Some types we'll just remap
    # (Used Google image search to figure out some of these).
    car_type_map = {'Hatchback': 'Wagon',
                    'SuperCab' : 'Pickup',
                    'Type-S' : 'Sedan',
                    'Type R' : 'Coupe',
                    'Cab' : 'Pickup',
                    'GS': 'Sedan',
                    'ZR1' : 'Coupe',
                    'Z06' : 'Coupe',
                    'HHR SS' : 'SUV',
                    'Cobalt SS': 'Coupe',
                    'TrailBlazer SS': 'SUV',
                    '300 SRT-8': 'Sedan',
                    'Challenger SRT8' : 'Coupe',
                    'Charger SRT-8': 'Sedan',
                    # different style coupe -- I expect difficulties with
                    # the fiat 500...
                    '500 Abarth': 'Coupe',
                    'Coupe IPL': 'Coupe',
                    'XKR' : 'Coupe',
                    'Superleggera' : 'Coupe'
                    }
    ret = []
    for i, cls in enumerate(classes):
        words = cls.split()
        year = words[-1]
        if words[-2] in car_types:
            car_type = words[-2]
        elif words[-2] in car_type_map:
            car_type = car_type_map[words[-2]]
        else:
            # look for a last-two-words match
            key = words[-3] + " " + words[-2]
            if key in car_type_map:
                car_type = car_type_map[key]
            else:
                print("Unknown car type: ", cls)
                # BUG FIX: previously fell through with car_type unset
                # (NameError on first hit, or silently reusing the
                # previous class's type). Use an explicit placeholder.
                car_type = 'Unknown'
        # make sure everything is unicode to avoid any
        # issues later
        car_type = unicode(car_type)
        # substring search; last matching brand wins (as before)
        brand = None
        for b in brands:
            if b in cls:
                brand = b
        # BUG FIX: guard against no brand matching, instead of silently
        # reusing the previous iteration's brand (or NameError).
        if brand is None:
            print("Unknown brand: ", cls)
            brand = u'Unknown'
        brand_len = len(brand.split())
        # this will be approximate, but I don't really care
        # about the model, at least for now
        model = " ".join(words[brand_len:-2])
        # Careful! Class ids start at 1
        ret.append((i + 1, brand, model, car_type, year))
    return ret
cls_tuples = parse_classes(classes)
cls_tuples
by_car_type = {} # type -> [tuples]
key_fn = operator.itemgetter(3)
for ct, group in it.groupby(sorted(cls_tuples, key=key_fn), key_fn):
by_car_type[ct] = list(group)
for k,v in sorted(by_car_type.items(), key=lambda x: len(x[1])):
print(k, len(v))
Hmm. Uneven number of car models per class. May make classification harder.
Ok, let's just try it with these classes -- load the images per class, give them appropriate labels.
To make the model smaller, let's start with just 20 images per car.
# make a mapping from class name to numbers we can one-hot encode
macro_classes = sorted(pd.unique([c[3] for c in cls_tuples]))
macro_class_map = dict((v,k) for (k,v) in enumerate(macro_classes))
macro_class_map
Let's save some metadata so we can load it in other notebooks -- this one is getting rather unwieldy.
import cPickle as pickle
save = True
if save:
with open('class_details.pkl','w') as f:
pickle.dump({'classes' : classes,
'examples' : examples,
'by_class' : by_class,
'by_car_type' : by_car_type,
'macro_classes' : macro_classes,
'macro_class_map': macro_class_map,
'cls_tuples': cls_tuples}, f)
IMG_PER_CAR = None # 20 # None to use all
valid_frac = 0.2
test_frac = 0.2
train = []
valid = []
test = []
for car_type, model_tuples in by_car_type.items():
macro_class_id = macro_class_map[car_type]
for model_tpl in model_tuples:
cls = model_tpl[0]
examples = load_examples(by_class, cls, limit=IMG_PER_CAR)
# replace class labels with the id of the macro class
examples = [(X, macro_class_id) for (X,y) in examples]
# split each class separately, so all have same fractions of
# train/valid/test
(cls_train, cls_valid, cls_test) = split_examples(
examples,
valid_frac, test_frac)
# and add them to the overall train/valid/test sets
train.extend(cls_train)
valid.extend(cls_valid)
test.extend(cls_test)
# ...and shuffle to make training work better.
np.random.shuffle(train)
np.random.shuffle(valid)
np.random.shuffle(test)
Still copy-pasting from above. Could refactor...
# We have lists of (X,Y) tuples. Let's unzip into lists of Xs and Ys.
X_train, Y_train = zip(*train)
X_valid, Y_valid = zip(*valid)
X_test, Y_test = zip(*test)
# and turn into np arrays of the right dimension.
def convert_X(xs):
    """Stack a list of (w, h, 3) images into a single float32 ndarray."""
    return np.asarray(xs, dtype='float32')
X_train = convert_X(X_train)
X_valid = convert_X(X_valid)
X_test = convert_X(X_test)
X_train.shape
def convert_Y(ys, macro_classes):
    """
    One-hot encode ys as an np array.
    (The macro-class ids were already made sequential from zero above.)
    """
    return np_utils.to_categorical(ys, len(macro_classes))
Y_train = convert_Y(Y_train, macro_classes)
Y_valid = convert_Y(Y_valid, macro_classes)
Y_test = convert_Y(Y_test, macro_classes)
Y_train.shape
# normalize the data, this time leaving it in color
X_train_norm = normalize_for_cnn(X_train)
X_valid_norm = normalize_for_cnn(X_valid)
X_test_norm = normalize_for_cnn(X_test)
# Let's use more or less the same model to start (num classes changes)
def cnn_model2(use_dropout=True):
    """Same small CNN as cnn_model, but sized for the macro-class problem
    (the number of output classes comes from the global macro_classes)."""
    model = Sequential()
    nb_filters = 16
    pool_size = (2, 2)
    filter_size = 3
    nb_classes = len(macro_classes)
    # Three identical conv -> relu -> pool (-> dropout) stages;
    # only the first layer needs input_shape.
    for stage in range(1, 4):
        with tf.name_scope("conv%d" % stage):
            if stage == 1:
                model.add(Convolution2D(nb_filters, filter_size,
                                        input_shape=(227, 227, 3)))
            else:
                model.add(Convolution2D(nb_filters, filter_size))
            model.add(Activation('relu'))
            model.add(MaxPooling2D(pool_size=pool_size))
            if use_dropout:
                model.add(Dropout(0.5))
    with tf.name_scope("dense1"):
        model.add(Flatten())
        model.add(Dense(16))
        model.add(Activation('relu'))
        if use_dropout:
            model.add(Dropout(0.5))
    with tf.name_scope("softmax"):
        model.add(Dense(nb_classes))
        model.add(Activation('softmax'))
    return model
# Uncomment if getting a "Invalid argument: You must feed a value
# for placeholder tensor ..." when rerunning training.
# K.clear_session() # https://github.com/fchollet/keras/issues/4499
model3 = cnn_model2()
model3.compile(loss='categorical_crossentropy',
optimizer='adadelta',
metrics=['accuracy'])
# This model will train slowly, so let's checkpoint it periodically
from keras.callbacks import ModelCheckpoint
recompute = False
if recompute:
# # Save info during computation so we can see what's happening
# tbCallback = TensorBoard(
# log_dir='./graph', histogram_freq=1,
# write_graph=False, write_images=False)
checkpoint = ModelCheckpoint('macro_class_cnn_checkpoint.5',
monitor='val_acc',
verbose=1,
save_best_only=True, mode='max',
save_weights_only=True)
# Fit the model! Using a bigger batch size and fewer epochs
# because we have ~10K training images now instead of 100.
history = model3.fit(
X_train_norm, Y_train,
batch_size=64, nb_epoch=50, verbose=1,
validation_data=(X_valid_norm, Y_valid),
callbacks=[checkpoint]
)
else:
model3.load_weights('macro_class_cnn.h5')
model3.save('macro_class_cnn.h5')
plot_training_curves(history.history);
# let's train some more -- clearly still getting better
history2 = model3.fit(
X_train_norm, Y_train,
batch_size=64, nb_epoch=50, verbose=1,
validation_data=(X_valid_norm, Y_valid),
callbacks=[checkpoint])
from helpers import combine_histories
plot_training_curves(combine_histories(history.history, history2.history));
The model is starting to overfit. Let's try to diagnose what's going on, then decide what to do.
Now that we have 8 different classes, we can see how often they get confused for each other by looking at the aptly named confusion matrix. I would expect lots of confusion between coupe and sedan, and van and minivan, and suv and wagon.
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(labels, predictions,
                          classes,
                          normalize=False,
                          title="Confusion matrix",
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix for predictions vs labels.
    Both should be one-hot.
    Based on:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    # convert from one-hot to categorical class ids
    cat_labels = np.argmax(labels, axis=1)
    cat_predicts = np.argmax(predictions, axis=1)
    cm = confusion_matrix(cat_labels, cat_predicts)
    # BUG FIX: normalize *before* drawing. Previously the heatmap was
    # rendered from the raw counts even when normalize=True, so only the
    # printed matrix and the text annotations reflected normalization.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.
    # annotate each cell; fractional (normalized) values get 2 decimals
    for i, j in it.product(range(cm.shape[0]), range(cm.shape[1])):
        if 0 < cm[i, j] < 1:
            val = "{:.2f}".format(cm[i, j])
        else:
            val = cm[i, j]
        plt.text(j, i, val,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Get the predictions
predict_train = model3.predict(X_train_norm)
predict_valid = model3.predict(X_valid_norm)
predict_test = model3.predict(X_test_norm)
plot_confusion_matrix(Y_test, predict_test, macro_classes,
normalize=False,
title="Test confusion matrix");
plot_confusion_matrix(Y_train, predict_train, macro_classes,
title="Train confusion matrix")
# NOTE(review): this cell is an exact duplicate of the one above -- it plots
# the *train* confusion matrix a second time. A validation-set plot
# (Y_valid / predict_valid) may have been intended.
plot_confusion_matrix(Y_train, predict_train, macro_classes,
                      title="Train confusion matrix")
Well, it seems that most car types are classified as sedan. Not too surprising, especially given that sedans are overrepresented. It's starting to learn that SUVs and pickups are different from sedans, and occasionally manages to distinguish coupes from sedans.
So far, it doesn't use minivan, van, or wagon labels at all.
Things to check / do:
We'll pick up in 10b-cars-continued.ipynb...
plt.bar()